In [1]:
%%HTML
<link rel="stylesheet" type="text/css" href="css/custom.css">
More info at my personal page and LinkedIn profile
Source code (jupyter notebook) of these slides:
In [2]:
import pandas
# read the passenger table from the CSV file shipped next to the notebook
titanic = pandas.read_csv('data/titanic.csv')
# last expression of the cell: shows the first rows as a preview
titanic.head()
Out[2]:
In [3]:
# summary statistics for the two numeric columns of interest
numeric_columns = ['Age', 'Fare']
titanic[numeric_columns].describe()
Out[3]:
In [4]:
# share of missing values per column (mean of the boolean mask, i.e. a fraction between 0 and 1)
titanic.loc[:, ['Age', 'Fare']].isnull().mean()
Out[4]:
In [5]:
# imputing missing values with the median
# The filled column is assigned back to the frame instead of calling
# `fillna(..., inplace=True)` on `titanic['Age']`: the in-place call acts on the
# object returned by the column selection, which newer pandas versions warn
# about and which is not guaranteed to update `titanic` itself.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
# check again: the share of missing ages should now be zero
titanic[['Age', 'Fare']].isna().mean()
Out[5]:
In [6]:
# box plots of both columns side by side (trailing `;` hides the returned axes repr)
titanic.boxplot(column=['Age', 'Fare']);
In [7]:
# drop the rows with the highest 5% of fares: keep only fares below the 95th percentile
fare_cutoff = titanic['Fare'].quantile(.95)
below_cutoff = titanic['Fare'] < fare_cutoff
titanic = titanic[below_cutoff]
# redraw the box plots on the filtered data
titanic[['Age', 'Fare']].boxplot();
In [8]:
# scatter plot of fare against age to eyeball their relationship
titanic.plot.scatter(x='Age', y='Fare');
In [9]:
# pairwise correlation matrix of the two columns
titanic.loc[:, ['Age', 'Fare']].corr()
Out[9]:
In [10]:
# turn the categorical columns into indicator (dummy) columns
categorical = titanic[['Sex', 'Embarked']]
dummies = pandas.get_dummies(categorical)
dummies.head()
Out[10]:
In [11]:
# build the model inputs: `y` is the target column, `x` the numeric columns
# joined (on the index) with the dummy columns
numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
y = titanic['Survived']
x = titanic[numeric_features].join(dummies)
x.head()
Out[11]:
In [12]:
import sklearn.ensemble
# A random forest is built from random samples of the rows and features, so
# without a fixed seed the importances displayed below change on every run.
# Pinning `random_state` makes the output of this cell reproducible.
clf = sklearn.ensemble.RandomForestClassifier(random_state=0)
clf.fit(x, y)
# importance of each input column according to the fitted forest, highest first
pandas.Series(clf.feature_importances_, index=x.columns).sort_values(ascending=False)
Out[12]:
In [13]:
# same scatter plot, coloured by the target: red where `y` is 0, blue where `y` is 1
point_colors = y.replace({0: 'red', 1: 'blue'})
titanic.plot(kind='scatter', x='Age', y='Fare', color=point_colors);
In [14]:
# some code resembling pandas to make the traceback look nice
import numpy
# both helpers ignore the file name and always report 10_000_000
cols_from_csv = rows_from_csv = lambda fname: 10_000_000
def read_csv(fname, *args, **kwargs):
    # stand-in for pandas.read_csv: extra arguments are accepted but unused,
    # and nothing is returned
    cols = cols_from_csv(fname)
    rows = rows_from_csv(fname)
    create_block_manager_from_arrays(rows, cols)
def create_block_manager_from_arrays(rows, cols):
    float_blocks = _multi_blockify((rows, cols), numpy.float64)
def _multi_blockify(shape, dtype):
    _stack_arrays(shape, dtype)
def _stack_arrays(shape, dtype):
    # the only real work in the call chain: request an uninitialised array of
    # `shape`; with 10_000_000 x 10_000_000 float64 values this is expected to
    # fail for lack of memory, producing the traceback the slide is about
    stacked = numpy.empty(shape, dtype)
# remember what pandas.read_csv currently points to, then swap in the stand-in
original_read_csv = pandas.read_csv
pandas.read_csv = read_csv
In [15]:
import pandas
# at this point `pandas.read_csv` is the stand-in installed in the previous cell,
# so the file name is never opened; the call is shown for the error it produces
df = pandas.read_csv('data/huge_file.csv')
In [16]:
# undo the monkeypatch: put back the function saved in `original_read_csv`
pandas.read_csv = original_read_csv
In [17]:
# remember the current reader so that a later cell can restore it
original_read_csv = pandas.read_csv


class DataFrame:
    """Dummy replacement for the object returned by `pandas.read_csv`.

    It stores nothing; its only purpose is that `.sum()` shows a picture.
    """

    def __init__(self, *args, **kwargs):
        """Accept and ignore whatever arguments `read_csv` would receive."""

    @staticmethod
    def sum():
        """Display the image stored at 'img/waiting.jpg'; returns nothing."""
        from IPython.display import Image, display
        display(Image(filename='img/waiting.jpg'))


# calling pandas.read_csv(...) now builds the dummy class above
pandas.read_csv = DataFrame
In [18]:
import pandas
# `pandas.read_csv` is the dummy `DataFrame` class installed in the previous cell,
# so `df` is an instance of it and no file is read
df = pandas.read_csv('data/huge_file.csv')
# the dummy `sum` only displays an image and returns nothing, so `total` is None
total = df.sum()
In [19]:
# undo the monkeypatch: put back the function saved in `original_read_csv`
pandas.read_csv = original_read_csv
In [20]:
import pandas
titanic = pandas.read_csv('data/titanic.csv')
# `titanic_male` is produced by boolean-mask selection from `titanic`
titanic_male = titanic[titanic['Sex'] == 'male']
# NOTE(review): filling in place on a selection of a selection is kept on purpose;
# it appears to be the pattern whose pandas warning this slide demonstrates
# (the next cell repeats it with warnings silenced) - do not "fix" it here
titanic_male['Age'].fillna(titanic_male['Age'].median(), inplace=True)
In [21]:
import warnings
import pandas
# silences every warning for the rest of the session, not only the one raised below
warnings.filterwarnings('ignore')
titanic = pandas.read_csv('data/titanic.csv')
titanic_male = titanic[titanic['Sex'] == 'male']
# same statement as in the previous cell; any warning it raises is now hidden
titanic_male['Age'].fillna(titanic_male['Age'].median(), inplace=True)
In [22]:
# pandas.read_csv('data/titanic.csv')
import csv
# `newline=''` is what the csv module asks for when a file object is handed to
# csv.reader: without it, newlines embedded in quoted fields are not
# interpreted correctly.
with open('data/titanic.csv', newline='') as f:
    reader = csv.reader(f)
    # the first row holds the column names
    columns = next(reader)
    # every remaining row becomes one list of strings
    data = list(reader)
In [23]:
# small hand-written table used by the next cells: `columns` names the fields,
# and every tuple in `data` is one row in that same field order
columns = ('Name', 'Sex', 'Age', 'Fare', 'Survived')
data = [('Montvila, Rev. Juozas', 'male', 27., 13., 0),
('Graham, Miss. Margaret Edith', 'female', 19., 30., 1),
# the age is missing for this row
('Johnston, Miss. Catherine Helen "Carrie"', 'female', None, 23.45, 0),
('Behr, Mr. Karl Howell', 'male', 26., 30., 1),
('Dooley, Mr. Patrick', 'male', 32., 7.75, 0)]
In [24]:
# df.iloc[3]
# plain-Python counterpart: the row at position 3 of the list of rows
data[3]
Out[24]:
In [25]:
# df['Fare']
# transpose the rows into one tuple per column, then pick the 'Fare' one
by_column = list(zip(*data))
fares = by_column[columns.index('Fare')]
fares
Out[25]:
In [26]:
# df['Fare'].mean()
# arithmetic mean by hand: total of the fares divided by how many there are
fare_total = sum(fares)
mean_fare = fare_total / len(fares)
mean_fare
Out[26]:
In [27]:
# df[df.Sex == 'male']
# keep only the rows whose 'Sex' field equals 'male'
male = [row for row in data if row[columns.index('Sex')] == 'male']
male
Out[27]:
unsigned integer 4 bits:
Binary | Decimal | Binary | Decimal | ||
---|---|---|---|---|---|
0000 | 0 | 1000 | 8 | ||
0001 | 1 | 1001 | 9 | ||
0010 | 2 | 1010 | 10 | ||
0011 | 3 | 1011 | 11 | ||
0100 | 4 | 1100 | 12 | ||
0101 | 5 | 1101 | 13 | ||
0110 | 6 | 1110 | 14 | ||
0111 | 7 | 1111 | 15 |
floating point 4 bits (2 bits exponent, 2 bits precision):
Binary | Exponent | Precision | Decimal | Binary | Exponent | Precision | Decimal | ||
---|---|---|---|---|---|---|---|---|---|
00-00 | 0 | 1.00 | 1.00 | 10-00 | -2 | 1.00 | 0.010 | ||
00-01 | 0 | 1.50 | 1.50 | 10-01 | -2 | 1.50 | 0.015 | ||
00-10 | 0 | 1.25 | 1.25 | 10-10 | -2 | 1.25 | 0.013 | ||
00-11 | 0 | 1.75 | 1.75 | 10-11 | -2 | 1.75 | 0.018 | ||
01-00 | 1 | 0 | +INF | 11-00 | -1 | 1.00 | 0.100 | ||
01-01 | 1 | 1 | 0 | 11-01 | -1 | 1.50 | 0.150 | ||
01-10 | 1 | 2 | NaN | 11-10 | -1 | 1.25 | 0.125 | ||
01-11 | 1 | 3 | -INF | 11-11 | -1 | 1.75 | 0.175 |
Not all numbers can be represented
In [28]:
# example for the statement above that not all numbers can be represented
0.1 + 0.2
Out[28]:
In [29]:
import numpy
# 256 does not fit in 8 unsigned bits (0..255). Creating the array directly
# with `dtype=numpy.uint8` is rejected by recent NumPy versions, which refuse
# out-of-range Python integers, so the overflow could no longer be shown.
# Building the array with the default integer type and casting afterwards
# keeps the demonstration: the cast wraps around and 256 becomes 0.
overflowed = numpy.array([256]).astype(numpy.uint8)
overflowed
Out[29]:
Special values exist in floating point but not in integer representation: `inf`, `-inf`, `NaN`
In [30]:
import pandas
# a Series built from two Python integers; printing it also shows its dtype
s = pandas.Series([1, 2])
print(s)
In [31]:
# overwrite the first element with NaN, then print again to compare the dtype
# with the one shown by the previous cell
s.loc[0] = float('NaN')
print(s)
In [55]:
import random
# number of elements in each benchmark list
size = 10_000_000
# two independent lists of `size` random floats drawn from [0, 1);
# no seed is set, so the values differ from run to run
list1 = [random.random() for i in range(size)]
list2 = [random.random() for i in range(size)]
In [56]:
import numpy
import pandas
series1 = pandas.Series(list1, dtype=numpy.uint8)
series2 = pandas.Series(list2, dtype=numpy.uint8)
print('Memory consumed: {:.2f} Mb'.format(series1.memory_usage(index=False) / 1024 / 1024))
%timeit (series1 > series2).mean()
In [57]:
import numpy
import pandas
# same measurement as the previous cell, this time storing the values as 64-bit floats
series1 = pandas.Series(list1, dtype=numpy.float64)
series2 = pandas.Series(list2, dtype=numpy.float64)
# memory taken by the values only (index excluded), converted from bytes
print('Memory consumed: {:.2f} Mb'.format(series1.memory_usage(index=False) / 1024 / 1024))
# time of an element-wise comparison followed by the mean of the boolean result
%timeit (series1 > series2).mean()
In [36]:
import random
import functools
import operator
size = 10_000_000
# fresh list of `size` random floats (replaces any earlier `list1`)
list1 = [random.random() for i in range(size)]
# time a pure-Python sum: fold the list with `operator.add`
%timeit functools.reduce(operator.add, list1)
In [37]:
import pandas
# same values as the list timed in the previous cell, now held in a Series
s1 = pandas.Series(list1)
# time the pandas sum for comparison with the pure-Python fold
%timeit s1.sum()
In [38]:
# pure-Python counterpart of `(series1 > series2).mean()`: count the pairs where
# the first item is larger and divide by the number of elements
%timeit sum(item1 > item2 for item1, item2 in zip(list1, list2)) / size
In [39]:
import pandas
# the same two lists, wrapped in Series with whatever dtype pandas infers
series1 = pandas.Series(list1)
series2 = pandas.Series(list2)
# time the vectorised comparison for contrast with the pure-Python version
%timeit (series1 > series2).mean()
The numpy and pandas implementations are somewhat similar to the machine code we've seen.
In [40]:
# a single Python list can mix element types: int, str, float, list and dict
data = [1, 'foo', 3.141592, ['Alan', 'Dennis', 'Linus'],
{'black': '#000000', 'white': '#ffffff'}]
data
Out[40]:
Address | PyObject | |
---|---|---|
0320 | ob_refcnt=1, ob_type=list, ob_size=3, ob_item=0510, allocated=16 | |
... | ... | |
0510 | 2478 | |
0511 | 5601 | |
0512 | 4882 | |
... | ... | |
2478 | ob_refcnt=1, ob_type=int, ob_value=15 | |
... | ... | |
4882 | ob_refcnt=1, ob_type=int, ob_value=2 | |
... | ... | |
5601 | ob_refcnt=1, ob_type=int, ob_value=10 |
1) LOAD 0320
2) LOAD 0510
3) LOAD 2478 (ob_value) -> R1
4) LOAD 0511
5) LOAD 5601 (ob_value) -> R2
6) ADD R1 R2 -> R2
...
compare to:
1) LOAD 0320 -> R1 (R1=15, R2= ?)
2) LOAD 0321 -> R2 (R1=15, R2=10)
3) ADD R1 R2 -> R2 (R1=15, R2=25)
In [41]:
import pandas
# wrap the mixed-type list in a Series; the displayed dtype shows how pandas stores it
pandas.Series(data)
Out[41]:
In [42]:
import pandas
# force the generic `object` dtype for the float values
series1 = pandas.Series(list1, dtype='object')
series2 = pandas.Series(list2, dtype='object')
# time the comparison on object-typed series (compare with the float64 cell that follows)
%timeit (series1 > series2).mean()
In [43]:
import numpy
import pandas
# same values, stored with an explicit 64-bit float dtype
series1 = pandas.Series(list1, dtype=numpy.float64)
series2 = pandas.Series(list2, dtype=numpy.float64)
# time the same comparison for contrast with the `object` dtype version
%timeit (series1 > series2).mean()
In [44]:
import pandas
# small frame with two columns of integers ('foo', 'foobar') and one of floats ('bar');
# `columns=` fixes the column order explicitly
df = pandas.DataFrame({'foo': [1, 3, 7],
'bar': [.55, 1.76, 3.33],
'foobar': [109, 60, 13]},
columns=['foo', 'bar', 'foobar'])
# per-column dtypes as text, followed by the rich display of the frame itself
print(df.dtypes)
df
Out[44]:
In [45]:
# peek at the private `_data` attribute of the frame (internal storage, not public API)
df._data
Out[45]:
In [46]:
# the `blocks` held by that internal object
# NOTE(review): presumably one block per dtype - confirm against the displayed output
df._data.blocks
Out[46]:
In [47]:
# values stored by the block at position 1
df._data.blocks[1].values
Out[47]:
In [48]:
# which Python type holds those values
type(df._data.blocks[1].values)
Out[48]:
In [49]:
# the `data` attribute of the values
# NOTE(review): presumably the underlying memory buffer - confirm
df._data.blocks[1].values.data
Out[49]:
In [50]:
# size of the stored values, as reported by their `nbytes` attribute
df._data.blocks[1].values.nbytes
Out[50]:
In [51]:
# dump the raw bytes of the block's values as one long string of 0s and 1s
bytes_ = df._data.blocks[1].values.tobytes()
bit_groups = (format(byte, '08b') for byte in bytes_)  # 8 binary digits per byte
print(''.join(bit_groups))
In [52]:
# the `strides` attribute of the stored values
# NOTE(review): presumably the byte step along each axis - confirm
df._data.blocks[1].values.strides
Out[52]:
In [53]:
import numpy
# 1000 x 1000 array of float64; `numpy.empty` leaves the contents uninitialised,
# which is fine here because only the timings are of interest
data = numpy.empty((1000, 1000), numpy.float64)
# compare the cost of summing along axis 0 with summing along axis 1
%timeit data.sum(axis=0)
%timeit data.sum(axis=1)
In [54]:
import pandas
# three ways of trying to set 'bar' to 0 on the rows where 'foo' > 2;
# the first two are kept on purpose as examples of what NOT to rely on
df = pandas.DataFrame({'foo': [1, 2, 3], 'bar': [5, 10, 15]})
df_view_or_copy = df[df.foo > 2]
df_view_or_copy['bar'] = 0 # are we modifying `df`? <- Chained indexing with unknown result: WARNING
df[df.foo > 2]['bar'] = 0 # same as before
df.loc[df.foo > 2, 'bar'] = 0 # we are modifying `df` (pandas manages __setitem__)
NaN
s in any typeobject
Join the pandas sprint tomorrow... :)
Articles:
Talks:
Floating point notation: